Carregar dados

# Load the raw data set and draw a 0.5% random sample of its rows.
dados_originais <- read.csv("spotify_data.csv")
# Fix the RNG seed so the sample, and every result derived from it, can be
# reproduced; without it each run analyses a different subset of rows.
set.seed(1234)
n_total <- nrow(dados_originais)
aux <- sample(seq_len(n_total), size = floor(n_total * 0.005))
# Drop the first four columns.
# NOTE(review): presumably identifier/text columns - confirm against the CSV.
dados <- dados_originais[aux, -c(1:4)]

# Keep only the rows from 2022; which() discards rows whose year is NA
# instead of turning them into all-NA rows.
dados2 <- dados[which(dados$year == 2022), ]
# Columns 2, 3, 6, 8 and 16 are the categorical variables; removing them
# leaves only the numeric variables used for distances and clustering.
wines <- dados2[, -c(2, 3, 6, 8, 16)]

Carregar pacotes

library(dplyr)
library(ggplot2)
library(GGally)
library(corrplot)
library(factoextra)
library(gridExtra)
library(plotly)

Descritiva

Popularity

# Interactive histogram of track popularity (default binning).
hist_popularidade <- ggplot(dados2, aes(popularity)) +
  geom_histogram(fill = "steelblue", color = "black") +
  xlab("Popularidade")
ggplotly(hist_popularidade)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Year

# Interactive bar chart with the number of tracks per year.
barras_ano <- ggplot(dados2, aes(year)) +
  geom_bar(fill = "steelblue", color = "black") +
  xlab("Ano")
ggplotly(barras_ano)

Danceability

# Interactive histogram of the danceability score (default binning).
hist_dancabilidade <- ggplot(dados2, aes(danceability)) +
  geom_histogram(fill = "steelblue", color = "black") +
  xlab("Dançabilidade") +
  ylab("Frequencia")
ggplotly(hist_dancabilidade)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Energy

# Interactive histogram of the energy score (default binning).
# Both axis titles are set inside labs(); ggplotly() does not take axis
# labels, so a `y` argument given to it would leave the y title unchanged.
ggplotly(ggplot(data = dados2, aes(x = energy)) +
  geom_histogram(color = "black", fill = "steelblue") +
  labs(x = 'Energia', y = "Frequencia"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Key

# Interactive bar chart of the musical key, with pitch-class names on the
# x axis (0 = C, ..., 11 = B).
# The count axis uses ggplot2's automatic breaks: fixed breaks in the tens of
# thousands would fall outside the bar heights of this small 2022 subsample
# and leave the axis with a single tick at 0.
ggplotly(ggplot(data = dados2, aes(x = key)) +
  geom_bar(color = "black", fill = "steelblue") +
  scale_x_continuous(
    breaks = seq(0, 11, by = 1),
    labels = c("C", "C#", "D", "D#", "E", "F", "F#", "G", "G#", "A", "A#", "B")
  ) +
  labs(x = 'Tom', y = 'Frequencia'))

Loudness

# Interactive histogram of loudness (default binning).
# Loudness values in this data are negative decibels, so the x breaks span
# negative values; positive-only breaks would leave the axis without ticks.
# The count axis uses ggplot2's automatic breaks because fixed breaks in the
# tens of thousands would fall outside the range of this small subsample.
ggplotly(ggplot(data = dados2, aes(x = loudness)) +
  geom_histogram(color = "black", fill = "steelblue") +
  scale_x_continuous(breaks = seq(-60, 5, by = 5)) +
  labs(x = 'Decibéis', y = 'Frequencia'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Mode

# Interactive bar chart of the mode (0 = minor, 1 = major).
# The count axis uses ggplot2's automatic breaks: fixed breaks in the
# hundreds of thousands would fall outside the bar heights of this small
# subsample and leave the axis with a single tick at 0.
ggplotly(ggplot(data = dados2, aes(x = mode)) +
  geom_bar(color = "black", fill = "steelblue") +
  scale_x_continuous(breaks = c(0, 1), labels = c("Menor", "Maior")) +
  labs(x = 'Escala', y = 'Frequencia'))

Speechiness

# Interactive histogram of the speechiness score (default binning).
# The count axis uses ggplot2's automatic breaks: fixed breaks in the
# hundreds of thousands would fall outside the range of this small subsample.
ggplotly(ggplot(data = dados2, aes(x = speechiness)) +
  geom_histogram(color = "black", fill = "steelblue") +
  labs(x = 'Fala', y = 'Frequencia'))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Acousticness

# Interactive histogram of the acousticness score (default binning).
# The count axis uses ggplot2's automatic breaks: fixed breaks in the
# hundreds of thousands would fall outside the range of this small subsample.
ggplotly(ggplot(data = dados2, aes(x = acousticness)) +
  geom_histogram(color = "black", fill = "steelblue") +
  labs(x = 'Acusticidade', y = "Frequencia"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Instrumentalness

# Interactive histogram of the instrumentalness score (default binning).
# The count axis uses ggplot2's automatic breaks: fixed breaks in the
# hundreds of thousands would fall outside the range of this small subsample.
ggplotly(ggplot(data = dados2, aes(x = instrumentalness)) +
  geom_histogram(color = "black", fill = "steelblue") +
  labs(x = 'Instrumentalidade', y = "Frequencia"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Liveness

# Interactive histogram of the liveness score (default binning).
# The count axis uses ggplot2's automatic breaks: fixed breaks in the
# hundreds of thousands would fall outside the range of this small subsample.
ggplotly(ggplot(data = dados2, aes(x = liveness)) +
  geom_histogram(color = "black", fill = "steelblue") +
  labs(x = "Presença de audiência", y = "Frequencia"))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Valence

# Interactive histogram of the valence score (default binning).
hist_valencia <- ggplot(dados2, aes(valence)) +
  geom_histogram(fill = "steelblue", color = "black") +
  xlab("Valência") +
  ylab("Frequencia")
ggplotly(hist_valencia)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Tempo

# Interactive histogram of the tempo (default binning).
hist_tempo <- ggplot(dados2, aes(tempo)) +
  geom_histogram(fill = "steelblue", color = "black") +
  xlab("Bpm") +
  ylab("Frequencia")
ggplotly(hist_tempo)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Duration

# Interactive histogram of track duration (default binning).
# The x axis is limited to [0, 1e6]; tracks outside that range are dropped
# from the plot.
hist_duracao <- ggplot(dados2, aes(duration_ms)) +
  geom_histogram(fill = "steelblue", color = "black") +
  xlim(0, 1e+06) +
  xlab("Duração") +
  ylab("Frequencia")
ggplotly(hist_duracao)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Time Signature

# Interactive bar chart of the time signature, one tick per integer 0..7.
barras_compasso <- ggplot(dados2, aes(time_signature)) +
  geom_bar(fill = "steelblue", color = "black") +
  scale_x_continuous(breaks = 0:7) +
  xlab("Compasso") +
  ylab("Frequencia")
ggplotly(barras_compasso)

Correlação

# Correlation matrix of the numeric variables (categorical columns removed).
# NOTE(review): this uses `dados` (all sampled years) while the other
# sections use the 2022 subset - confirm this is intentional.
matriz_cor <- cor(dados[, -c(2, 3, 6, 8, 16)])
corrplot(matriz_cor)

Distâncias

Distância de Pearson

# Correlation-based (Pearson) distance between the 2022 tracks, computed on
# the numeric variables only.
numericas_2022 <- dados2[, -c(2, 3, 6, 8, 16)]
res.dist <- get_dist(numericas_2022, method = "pearson")
# Draw the dissimilarity matrix as a heatmap.
fviz_dist(res.dist, lab_size = 8)

Distância euclidiana

# Euclidean distance between the 2022 tracks (numeric variables only).
# "euclidean" is the method name documented by stats::dist().
# NOTE(review): `wines` is not standardised here, so variables on large
# scales (e.g. duration_ms) dominate the distance - consider stand = TRUE.
res.dist2 <- get_dist(wines, method = "euclidean")
# Draw the dissimilarity matrix as a heatmap.
fviz_dist(res.dist2, lab_size = 8)

Distância de Manhattan

# Manhattan distance between the 2022 tracks (numeric variables only).
res.dist3 <- get_dist(x = wines, method = "manhattan")
# Draw the dissimilarity matrix as a heatmap.
fviz_dist(res.dist3, lab_size = 8)

Distância de Minkowski

# Minkowski distance between the 2022 tracks (numeric variables only).
# NOTE(review): no power `p` is given; with the stats::dist() default
# (p = 2) this presumably coincides with the Euclidean distance - confirm.
res.dist4 <- get_dist(x = wines, method = "minkowski")
# Draw the dissimilarity matrix as a heatmap.
fviz_dist(res.dist4, lab_size = 8)

Gráfico interativo

#install.packages("d3heatmap")
library(d3heatmap)
## 
## ======================
## Welcome to d3heatmap version 0.9.0
## 
## Type citation('d3heatmap') for how to cite the package.
## Type ?d3heatmap for the main documentation.
## 
## The github page is: https://github.com/talgalili/d3heatmap/
## Please submit your suggestions and bug-reports at: https://github.com/talgalili/d3heatmap/issues
## You may ask questions at stackoverflow, use the r and d3heatmap tags: 
##   https://stackoverflow.com/questions/tagged/d3heatmap
## ======================
## 
## Attaching package: 'd3heatmap'
## The following objects are masked from 'package:base':
## 
##     print, save
# Interactive heatmap of the column-standardised numeric variables.
wines_padronizado <- scale(wines)
d3heatmap(wines_padronizado, colors = "RdYlBu")
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette RdYlBu is 11
## Returning the palette you asked for with that many colors
## Warning: Some values were outside the color scale and will be treated as NA
## Warning in RColorBrewer::brewer.pal(n, pal): n too large, allowed maximum for palette RdYlBu is 11
## Returning the palette you asked for with that many colors

Dendrograma

# Hierarchical clustering of the numeric variables (Ward linkage on
# Euclidean distances), cut into 100 clusters.
hc <- eclust(
  x = wines,
  FUNcluster = "hclust",
  k = 100,
  hc_metric = "euclidian",
  hc_method = "ward.D"
)
## Warning: The `<scale>` argument of `guides()` cannot be `FALSE`. Use "none" instead as
## of ggplot2 3.3.4.
## ℹ The deprecated feature was likely used in the factoextra package.
##   Please report the issue at <]8;;https://github.com/kassambara/factoextra/issueshttps://github.com/kassambara/factoextra/issues]8;;>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
# Dendrogram of the eclust() result, with a rectangle around each cluster.
fviz_dend(x = hc, rect = TRUE)

# Dendrogram built with base R defaults for dist() and hclust().
distancias <- dist(wines)
fviz_dend(hclust(distancias))

K-Means Clustering

# Standardise the numeric variables (categorical columns removed).
variaveis_numericas <- dados2[, -c(2, 3, 6, 8, 16)]
dadosNorm <- as.data.frame(scale(variaveis_numericas))

# Seed the RNG so the random initial centres are reproducible, then fit
# k-means with two clusters and show the cluster sizes.
set.seed(1234)
dados_k2 <- kmeans(x = dadosNorm, centers = 2)
dados_k2$size
## [1]  49 214
# Share of observations in each cluster. prop.table() divides the sizes by
# the total number of clustered rows, so the shares always add up to 1.
prop.table(dados_k2$size)
## [1] 0.1863118 0.8136882

O primeiro cluster tem 49 observações (18,6%), enquanto o segundo tem 214 (81,4%).

# Mean of every numeric variable (original scale) within each k-means cluster.
aggregate(
  x = dados2[, -c(2, 3, 6, 8, 16)],
  by = list(dados_k2$cluster),
  FUN = mean
)
##   Group.1 popularity danceability    energy   loudness speechiness acousticness
## 1       1   28.95918    0.3948367 0.2740090 -19.388000  0.05287551    0.7914496
## 2       2   31.40654    0.5742710 0.7610327  -6.389164  0.10392243    0.1981805
##   instrumentalness  liveness   valence    tempo duration_ms
## 1        0.6218577 0.2226163 0.2127294 100.4153    176786.1
## 2        0.1654614 0.2384033 0.4703551 128.0888    217644.9

Otimizando k

# How many clusters? For k = 1..10, record the between-cluster and the total
# within-cluster sums of squares of a k-means fit on the standardised data.
k_max <- 10
bss <- numeric(k_max)
wss <- numeric(k_max)

for (i in seq_len(k_max)) {
  # Fit once per k so both statistics describe the same partition; two
  # separate kmeans() calls would start from different random centres.
  ajuste <- kmeans(dadosNorm, centers = i)
  bss[i] <- ajuste$betweenss
  wss[i] <- ajuste$tot.withinss
}

# Between-cluster sum of squares vs. number of clusters.
# Built with ggplot() because qplot() is deprecated in recent ggplot2.
p3 <- ggplot(
  data.frame(k = seq_along(bss), ss = bss),
  aes(x = k, y = ss)
) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Between-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Total within-cluster sum of squares vs. number of clusters.
# Built with ggplot() because qplot() is deprecated in recent ggplot2.
p4 <- ggplot(
  data.frame(k = seq_along(wss), ss = wss),
  aes(x = k, y = ss)
) +
  geom_point() +
  geom_line() +
  labs(x = "Number of clusters", y = "Total within-cluster sum of squares") +
  scale_x_continuous(breaks = seq(0, 10, 1)) +
  theme_bw()

# Show both sum-of-squares curves side by side.
grid.arrange(grobs = list(p3, p4), ncol = 2)

A partir das somas de quadrados, 2 ou 3 clusters seriam ideais. O ganho com a inclusão de mais clusters é, em geral, similar, com exceção da diferença entre 6 e 7 clusters.

Plots